0. Data used:¶

  • Transfermarkt: https://www.kaggle.com/datasets/davidcariboo/player-scores

1. Data Loading¶

In [107]:
# Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots
In [108]:
import plotly, plotly.io as pio
from kaleido.scopes.plotly import PlotlyScope

pio.kaleido.scope = PlotlyScope()

pio.renderers.default = "notebook"
In [109]:
# Define data path
data_path = "../data"

# Load data
try:
    df_tm_clubs = pd.read_csv(f"{data_path}/tm_clubs.csv")
    df_tm_transfers = pd.read_csv(f"{data_path}/tm_transfers.csv")
    df_tm_competitions = pd.read_csv(f"{data_path}/tm_competitions.csv")
    print("Files loaded successfully")
except FileNotFoundError:
    print(f"Error: One or more files at {data_path} were not found.")
    exit()
Files loaded successfully

2. Data Cleaning¶

2.1. Data transformation¶

In [110]:
# Modification of trasnfer_season column to preserve order
def expand_season(season_str):
    start_year = int(season_str[:2])
    end_year = int(season_str[3:])

    if start_year < 70:  # Consider that are 20xx
        start_full = 2000 + start_year
        end_full = 2000 + end_year
    else:  # Consider that are 19xx
        start_full = 1900 + start_year
        end_full = 1900 + end_year

    return f"{start_full}/{str(end_full)[-2:]}"

df_tm_transfers['transfer_season'] = df_tm_transfers['transfer_season'].apply(expand_season) 
In [111]:
# Convert transfer_date into datetime type
df_tm_transfers['transfer_date'] = pd.to_datetime(df_tm_transfers['transfer_date'], errors='coerce')

# Remove data from 2025 onward
df_tm_transfers = df_tm_transfers[
    df_tm_transfers['transfer_date'] <= pd.Timestamp('2024-12-31')
].copy()
In [112]:
# Still remain noisy data from 2025 onward
df_tm_transfers[
    df_tm_transfers['transfer_season'].str.split('/').str[0].astype(int) >= 2025
]
Out[112]:
player_id transfer_date transfer_season from_club_id to_club_id from_club_name to_club_name transfer_fee market_value_in_eur player_name
5950 1027067 2024-07-01 2025/26 67278 67279 St. Johnst. U18 St. Johnst. B NaN NaN Bayley Klimionek
6754 551752 2024-06-30 2025/26 8970 416 Frosinone Torino 0.0 2000000.0 Demba Seck
In [113]:
# Complete the cleaning process
df_tm_transfers = df_tm_transfers[
    df_tm_transfers['transfer_season'].str.split('/').str[0].astype(int) <= 2024
].copy()

df_tm_transfers[
    df_tm_transfers['transfer_season'].str.split('/').str[0].astype(int) >= 2025
]
Out[113]:
player_id transfer_date transfer_season from_club_id to_club_id from_club_name to_club_name transfer_fee market_value_in_eur player_name

2.2. Cleaning Missing Values¶

In [114]:
# Check missing values in the dataset
missing_values = df_tm_transfers.isnull().sum()

# Display the missing values for each column
missing_values
Out[114]:
player_id                  0
transfer_date              0
transfer_season            0
from_club_id               0
to_club_id                 0
from_club_name             0
to_club_name               0
transfer_fee           27512
market_value_in_eur    30211
player_name                0
dtype: int64
In [115]:
# Check how many rows have missing values in both columns, transfer_fee and market_value_in_eur
missing_both_values = df_tm_transfers[
    df_tm_transfers['transfer_fee'].isnull() & df_tm_transfers['market_value_in_eur'].isnull()
]

# Show the result
len(missing_both_values)
Out[115]:
19383
In [116]:
# Check missing values by year
missing_by_year = (
    df_tm_transfers
    .groupby(df_tm_transfers['transfer_date'].dt.year)[['transfer_fee', 'market_value_in_eur']]
    .apply(lambda group: pd.Series({
        'missing_transfer_fee': group['transfer_fee'].isnull().sum(),
        'missing_market_value': group['market_value_in_eur'].isnull().sum(),
        'missing_both_values': (
            group['transfer_fee'].isnull() &
            group['market_value_in_eur'].isnull()
        ).sum()
    }))
    .reset_index()
    .rename(columns={'transfer_date': 'year'})
)

missing_by_year
Out[116]:
year missing_transfer_fee missing_market_value missing_both_values
0 1993 0 1 0
1 1994 2 3 2
2 1995 0 1 0
3 1996 2 3 2
4 1997 4 6 4
5 1998 6 16 6
6 1999 17 22 17
7 2000 17 30 17
8 2001 30 69 30
9 2002 42 77 42
10 2003 71 122 71
11 2004 120 199 120
12 2005 160 282 154
13 2006 244 382 234
14 2007 356 542 336
15 2008 452 613 406
16 2009 586 798 518
17 2010 717 990 625
18 2011 870 1220 736
19 2012 1058 1417 894
20 2013 1197 1587 986
21 2014 1328 1748 1075
22 2015 1545 1951 1234
23 2016 1624 2006 1294
24 2017 1908 2167 1414
25 2018 2072 2377 1534
26 2019 2189 2492 1587
27 2020 2454 2629 1801
28 2021 2323 2231 1509
29 2022 2320 1990 1367
30 2023 1934 1450 932
31 2024 1864 790 436

2.3. Missing values per season¶

In [117]:
# Create column of date for the continuous X axis
missing_by_year['year_date'] = pd.to_datetime(missing_by_year['year'].astype(str) + '-01-01')

# Convert to long format for plotly
df_long = missing_by_year.melt(
    id_vars=['year', 'year_date'],
    value_vars=['missing_transfer_fee', 'missing_market_value', 'missing_both_values'],
    var_name='metric',
    value_name='missing_count'
)

# Rename metrics for the legend
name_map = {
    'missing_transfer_fee': 'Missing Transfer Fee',
    'missing_market_value': 'Missing Market Value',
    'missing_both_values': 'Missing Both Values'
}
df_long['metric'] = df_long['metric'].map(name_map)

# Create figure
fig = px.line(
    df_long.sort_values('year_date'),
    x='year_date',
    y='missing_count',
    color='metric',
    markers=True,
    hover_data={'year': True, 'year_date': False}
)

# Layout
ordered = missing_by_year.sort_values('year_date')
fig.update_layout(
    title='Missing Values by Transfer Year',
    xaxis_title='Year',
    yaxis_title='Number of Missing Values',
    legend_title='Metric',
    template='plotly_dark',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(
        tickmode='array',
        tickvals=ordered['year_date'],
        ticktext=ordered['year'].astype(str),
        tickangle=55
    )
)

fig.show()

3. EDA (Exploratory Data Analysis)¶

3.1. Create the Cleaned Dataset by Removing Rows with Missing Values¶

In [118]:
# Create a new dataframe with the cleaned data
df_tm_transfers_cleaned = df_tm_transfers.dropna(
    subset=['transfer_fee', 'market_value_in_eur'],
    how='any'
).copy()

# Show the dataset information that have been recently cleaned
print(f"Dataset Shape: {df_tm_transfers_cleaned.shape}")
print(f"\nColumn Data Types:\n{df_tm_transfers_cleaned.dtypes}")
print(f"\nMissing Values:\n{df_tm_transfers_cleaned.isnull().sum()}")
Dataset Shape: (39378, 10)

Column Data Types:
player_id                       int64
transfer_date          datetime64[ns]
transfer_season                object
from_club_id                    int64
to_club_id                      int64
from_club_name                 object
to_club_name                   object
transfer_fee                  float64
market_value_in_eur           float64
player_name                    object
dtype: object

Missing Values:
player_id              0
transfer_date          0
transfer_season        0
from_club_id           0
to_club_id             0
from_club_name         0
to_club_name           0
transfer_fee           0
market_value_in_eur    0
player_name            0
dtype: int64
In [119]:
# Check missing values in the dataset
missing_values = df_tm_transfers_cleaned.isnull().sum()

# Display the missing values for each column
missing_values
Out[119]:
player_id              0
transfer_date          0
transfer_season        0
from_club_id           0
to_club_id             0
from_club_name         0
to_club_name           0
transfer_fee           0
market_value_in_eur    0
player_name            0
dtype: int64

3.2. Number of Transfers per Year (Cleaned Dataset)¶

In [120]:
# Count by year
counts = (
    df_tm_transfers_cleaned['transfer_date']
    .dt.year
    .value_counts()
    .rename_axis('year')
    .sort_index()
    .reset_index(name='n_transfers')
)

# 3-year rolling average
counts['roll3'] = counts['n_transfers'].rolling(3, center=True, min_periods=2).mean()

# Interactive plot
fig = px.bar(
    counts,
    x='year',
    y='n_transfers',
    color='n_transfers',
    text='n_transfers',
    title='Total Number of Transfers per Year',
    hover_data={'year': True}
)
fig.update_traces(textposition='outside', cliponaxis=False)

# Add trend line
fig.add_trace(go.Scatter(
    x=counts['year'],
    y=counts['roll3'],
    mode='lines+markers',
    name='3Y Rolling Avg',
    hovertemplate='Year: %{x}<br>3Y Rolling Avg: %{y:.0f}<extra></extra>'
))

# Style
fig.update_layout(
  legend=dict(
        orientation='v',   # vertical
        yanchor='top',
        y=1,
        xanchor='left',
        x=0
    ),
    template='plotly_dark',
    xaxis_title='Year',
    yaxis_title='Number of Transfers',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(
        tickmode='array',
        tickvals=counts['year'], 
        ticktext=counts['year'],
        tickangle=55,
        rangeslider=dict(visible=False),
        rangeselector=dict(visible=False)
    ),
    coloraxis_colorbar=dict(title='Transfers')
)

fig.show()

3.3. Numeric Distribution of Transfer Fees¶

In [121]:
# Prepare data (only valid transfer_fee)
series = pd.to_numeric(df_tm_transfers_cleaned['transfer_fee'], errors='coerce').dropna()
if series.empty:
    raise ValueError("There are no valid values in 'transfer_fee'.")

# Statistics
mean_val = series.mean()
median_val = series.median()

# Manual binning to color by intensity
nbins = 40 
counts, bin_edges = np.histogram(series, bins=nbins)
bin_left = bin_edges[:-1]
bin_right = bin_edges[1:]
bin_center = (bin_left + bin_right) / 2

df_bins = pd.DataFrame({
    'bin_left': bin_left,
    'bin_right': bin_right,
    'bin_center': bin_center,
    'count': counts
})

# Interactive plot
fig = px.bar(
    df_bins,
    x='bin_center',
    y='count',
    color='count',
    text='count',
    title='Distribution of Transfer Fees (Log Scale)',
    labels={'bin_center': 'Transfer Fee (€)', 'count': 'Count'},
    hover_data={'bin_left': ':.0f', 'bin_right': ':.0f', 'bin_center': ':.0f', 'count': ':.0f'}
)

fig.update_traces(textposition='outside', cliponaxis=False)

# Add peak annotation
idx_max = df_bins['count'].idxmax()
fig.add_annotation(
    x=df_bins.loc[idx_max, 'bin_center'],
    y=df_bins.loc[idx_max, 'count'],
    text=f"Peak<br>{int(df_bins.loc[idx_max, 'count']):,}",
    showarrow=True,
    arrowhead=2,
    yshift=10
)

# Style
fig.update_layout(
    template='plotly_dark',
    xaxis_title='Transfer Fee (€)',
    yaxis_title='Count',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    legend=dict(
        orientation='v',
        yanchor='top', y=1,
        xanchor='left', x=0
    )
)

# X axis with more compact format
fig.update_xaxes(tickformat=',.0f')
fig.update_yaxes(tickformat=',.0f')
fig.update_yaxes(type='log')

fig.show()

3.3.1. Removing Free Transfers¶

In [122]:
# Filter: only valid transfer_fee and > 0 (exclude free)
series_all = pd.to_numeric(df_tm_transfers_cleaned['transfer_fee'], errors='coerce').dropna()
series = series_all[series_all > 0]

if series.empty:
    raise ValueError("There are no valid values in 'transfer_fee' > 0 (excluding free).")

n_free = (series_all == 0).sum()

# Statistics (excluding free)
mean_val = series.mean()
median_val = series.median()

# 3) Bineado manual para colorear por intensidad
nbins = 40
counts, bin_edges = np.histogram(series, bins=nbins)
bin_left = bin_edges[:-1]
bin_right = bin_edges[1:]
bin_center = (bin_left + bin_right) / 2

df_bins = pd.DataFrame({
    'bin_left': bin_left,
    'bin_right': bin_right,
    'bin_center': bin_center,
    'count': counts
})

# Interactive plot
fig = px.bar(
    df_bins,
    x='bin_center',
    y='count',
    color='count',
    text='count',
    title='Distribution of Transfer Fees - Excluding Free Transfers (Log Scale)',
    labels={'bin_center': 'Transfer Fee (€)', 'count': 'Count'},
    hover_data={'bin_left': ':.0f', 'bin_right': ':.0f', 'bin_center': ':.0f', 'count': ':.0f'}
)

fig.update_traces(textposition='outside', cliponaxis=False)

# Style
fig.update_layout(
    template='plotly_dark',
    xaxis_title='Transfer Fee (€)',
    yaxis_title='Count',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    legend=dict(orientation='v', yanchor='top', y=1, xanchor='left', x=0)
)

fig.update_xaxes(tickformat=',.0f')
fig.update_yaxes(tickformat=',.0f')
fig.update_yaxes(type='log')

# Add note with number of free excluded
if n_free > 0:
    fig.add_annotation(
        x=1, y=1.12, xref='paper', yref='paper',
        text=f"Free transfers excluded: {n_free:,}",
        showarrow=False, align='right'
    )

fig.show()

3.4. Correlation between Market Value and Transfer Fees¶

In [123]:
# Filter and group by year
df_yearly = (
    df_tm_transfers_cleaned
      .loc[
          (df_tm_transfers_cleaned['transfer_fee'] > 0)
      ]
      .dropna(subset=['transfer_fee', 'market_value_in_eur', 'transfer_date'])
      .assign(year=lambda d: d['transfer_date'].dt.year)
      .loc[lambda d: d['year'] >= 2008]
      .groupby('year', as_index=False)
      .agg({
          'market_value_in_eur': 'mean',
          'transfer_fee': 'mean'
      })
)

# Create figure with two lines
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_yearly['year'],
    y=df_yearly['market_value_in_eur'],
    mode='lines+markers',
    name='Avg Market Value',
    line=dict(color='royalblue', width=2)
))

fig.add_trace(go.Scatter(
    x=df_yearly['year'],
    y=df_yearly['transfer_fee'],
    mode='lines+markers',
    name='Avg Transfer Fee',
    line=dict(color='orange', width=2)
))

# Style the graph
fig.update_layout(
    title='Average Market Value vs Transfer Fee by Year (Excluding Free Transfers)',
    xaxis_title='Year',
    yaxis_title='Average Value (€)',
    template='plotly_dark',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(
        tickmode='linear',
        dtick=1,
        tickangle=55
    )
)

fig.show()

3.5. Evalution of Average Spend by Transfer by Year¶

In [124]:
df_f = (
    df_tm_transfers_cleaned
      .dropna(subset=['transfer_fee', 'transfer_date'])
      .loc[df_tm_transfers_cleaned['transfer_fee'] > 0]
      .assign(year=lambda d: d['transfer_date'].dt.year)
      .loc[lambda d: d['year'] >= 2008]
)

# Grouping by year
yearly = (
    df_f.groupby('year', as_index=False)
        .agg(
            avg_fee=('transfer_fee', 'mean'),
            n_transfers=('transfer_fee', 'size')
        )
        .sort_values('year')
)

# Optional: 3-year trailing average for the line
yearly['avg_fee_roll3'] = yearly['avg_fee'].rolling(3, center=False, min_periods=1).mean()

# Figure with double axis: line (avg) + bars (count)
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Bars: number of transfers (secondary axis)
fig.add_trace(
    go.Bar(
        x=yearly['year'],
        y=yearly['n_transfers'],
        name='Transfers',
        opacity=0.35,
        hovertemplate='Year: %{x}<br>Transfers: %{y}<extra></extra>'
    ),
    secondary_y=True
)

# Main line: average transfer fee
fig.add_trace(
    go.Scatter(
        x=yearly['year'],
        y=yearly['avg_fee'],
        mode='lines+markers',
        name='Avg Transfer Fee',
        line=dict(width=3),
        marker=dict(size=7),
        hovertemplate='Year: %{x}<br>Avg Fee: €%{y:,.0f}<extra></extra>'
    ),
    secondary_y=False
)

# Optional: smoothed line (3Y)
fig.add_trace(
    go.Scatter(
        x=yearly['year'],
        y=yearly['avg_fee_roll3'],
        mode='lines',
        name='Avg Fee (3Y MA)',
        line=dict(dash='dash'),
        hovertemplate='Year: %{x}<br>3Y MA: €%{y:,.0f}<extra></extra>'
    ),
    secondary_y=False
)

# Peak annotation
idx_max = yearly['avg_fee'].idxmax()
fig.add_annotation(
    x=yearly.loc[idx_max, 'year'],
    y=yearly.loc[idx_max, 'avg_fee'],
    text=f"Peak €{yearly.loc[idx_max, 'avg_fee']:,.0f}",
    showarrow=True,
    arrowhead=2,
    yshift=10
)

# Style the graph
fig.update_layout(
    title='Average Transfer Fee per Year (Excluding Free Transfers)',
    template='plotly_dark',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    legend=dict(orientation='v', yanchor='top', y=1, xanchor='left', x=0),
    xaxis=dict(
        title='Year',
        tickmode='linear',
        dtick=1,
        tickangle=55
    ),
)

fig.update_yaxes(
    title_text='Avg Transfer Fee (€)',
    secondary_y=False
)
fig.update_yaxes(
    title_text='Transfers',
    secondary_y=True
)

fig.show()

3.6. Number of Transfers of more than 10 mill. € by Year¶

In [125]:
# Filter transfers over 10M from 2008
df_over_10m = (
    df_tm_transfers_cleaned
      .dropna(subset=['transfer_fee', 'transfer_date'])
      .loc[df_tm_transfers_cleaned['transfer_fee'] > 10_000_000]
      .assign(year=lambda d: d['transfer_date'].dt.year)
      .loc[lambda d: d['year'] >= 2008]
)

# Count by year
transfers_over_10m_by_year = (
    df_over_10m.groupby('year', as_index=False)
               .agg(n_transfers=('transfer_fee', 'size'))
               .sort_values('year')
)

# Calculate 3-year rolling average
transfers_over_10m_by_year['roll3'] = (
    transfers_over_10m_by_year['n_transfers']
    .rolling(3, center=True, min_periods=2)
    .mean()
)

# Bar graph with continuous colors
fig = px.bar(
    transfers_over_10m_by_year,
    x='year',
    y='n_transfers',
    color='n_transfers',  # <- color continuo según valor
    text='n_transfers',
    title='Number of Transfers > €10 mill. by Year',
    hover_data={'year': True}
)
fig.update_traces(textposition='outside', cliponaxis=False)

# Add trend line
fig.add_trace(go.Scatter(
    x=transfers_over_10m_by_year['year'],
    y=transfers_over_10m_by_year['roll3'],
    mode='lines+markers',
    name='3Y Rolling Avg',
    hovertemplate='Year: %{x}<br>3Y Rolling Avg: %{y:.0f}<extra></extra>'
))

# Style the graph
fig.update_layout(
    legend=dict(
        orientation='v',
        yanchor='top',
        y=1,
        xanchor='left',
        x=0
    ),
    template='plotly_dark',
    xaxis_title='Year',
    yaxis_title='Number of Transfers > €10 mill.',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(
        tickmode='array',
        tickvals=transfers_over_10m_by_year['year'], 
        ticktext=transfers_over_10m_by_year['year'],
        tickangle=55,
        rangeslider=dict(visible=False),
        rangeselector=dict(visible=False)
    ),
    coloraxis_colorbar=dict(title='Transfers')
)

fig.show()

3.7. Market Growth over the last 10 Years¶

In [126]:
# Base: year, cleaning and aggregations
df_base = (
    df_tm_transfers_cleaned
      .dropna(subset=['transfer_date'])
      .assign(year=lambda d: d['transfer_date'].dt.year)
)

# Count of transfers
counts = (
    df_base
      .groupby('year', as_index=True)
      .size()
      .rename('n_transfers')
      .sort_index()
)

# Money moved
money = (
    df_base
      .dropna(subset=['transfer_fee'])
      .groupby('year', as_index=True)['transfer_fee']
      .sum()
      .rename('total_fee')
      .sort_index()
)

# Ensure continuity of years (fill with 0)
all_years = pd.Index(range(min(counts.index.min(), money.index.min()),
                           max(counts.index.max(), money.index.max())+1), name='year')
counts = counts.reindex(all_years, fill_value=0)
money  = money.reindex(all_years,  fill_value=0)

# Window: last 10 years available
end_year = int(all_years.max())
start_year = max(int(all_years.min()), end_year - 9)

counts_10 = counts.loc[start_year:end_year]
money_10  = money.loc[start_year:end_year]

years_diff = end_year - start_year  # number of years between extremes (for CAGR)

def safe_cagr(end_val, start_val, years):
    if years <= 0:
        return np.nan
    if start_val <= 0:
        return np.nan  # CAGR not defined if the start is 0 or negative
    return (end_val / start_val) ** (1/years) - 1

cagr_counts = safe_cagr(counts_10.iloc[-1], counts_10.iloc[0], years_diff)
cagr_money  = safe_cagr(money_10.iloc[-1],  money_10.iloc[0],  years_diff)

# Graph 1: Number of transfers and CAGR
fig1 = px.bar(
    counts_10.reset_index(),
    x='year',
    y='n_transfers',
    color='n_transfers',
    text='n_transfers',
    title=f'Number of Transfers (Including Free Transfers) per Year — CAGR {("" if pd.notna(cagr_counts) else "N/A") if pd.isna(cagr_counts) else f"{cagr_counts*100:.1f}%"} ({start_year}–{end_year})',
    hover_data={'year': True}
)
fig1.update_traces(textposition='outside', cliponaxis=False)
fig1.update_layout(
    template='plotly_dark',
    xaxis_title='Year',
    yaxis_title='Transfers',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(tickmode='array',
               tickvals=counts_10.index,
               ticktext=counts_10.index.astype(str),
               tickangle=55),
    coloraxis_colorbar=dict(title='Transfers')
)

fig1.show()

# Graph 2: Money moved and CAGR
fig2 = px.bar(
    money_10.reset_index(),
    x='year',
    y='total_fee',
    color='total_fee',
    text=money_10.reset_index()['total_fee'].map(lambda v: f"€{v:,.0f}"),
    title=f'Total Money Moved per Year — CAGR {("" if pd.notna(cagr_money) else "N/A") if pd.isna(cagr_money) else f"{cagr_money*100:.1f}%"} ({start_year}–{end_year})',
    hover_data={'year': True}
)
fig2.update_traces(textposition='outside', cliponaxis=False)
fig2.update_layout(
    template='plotly_dark',
    xaxis_title='Year',
    yaxis_title='Aggregated Fees',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(tickmode='array',
               tickvals=money_10.index,
               ticktext=money_10.index.astype(str),
               tickangle=55),
    coloraxis_colorbar=dict(title='Fees')
)

fig2.show()

4. Club Analysis¶

4.1. Average Spending by Club¶

In [127]:
# Filtering
df_club_spending = (
    df_tm_transfers_cleaned
      .dropna(subset=['transfer_fee', 'transfer_date', 'to_club_id'])
      .assign(year=lambda d: d['transfer_date'].dt.year)
      .loc[lambda d: d['year'] >= 2008]
)

# Total spending per club and year
club_year_spending = (
    df_club_spending
      .groupby(['year', 'to_club_id'], as_index=False)
      .agg(total_spent=('transfer_fee', 'sum'))
)

# Average spending per club per year
avg_spending_per_club = (
    club_year_spending
      .groupby('year', as_index=False)
      .agg(avg_spent_per_club=('total_spent', 'mean'))
)

# Calculate CAGR
start_year = int(avg_spending_per_club['year'].min())
end_year   = int(avg_spending_per_club['year'].max())
years_diff = end_year - start_year
start_val  = avg_spending_per_club.loc[avg_spending_per_club['year'] == start_year, 'avg_spent_per_club'].iloc[0]
end_val    = avg_spending_per_club.loc[avg_spending_per_club['year'] == end_year, 'avg_spent_per_club'].iloc[0]

if start_val > 0:
    cagr = (end_val / start_val) ** (1 / years_diff) - 1
else:
    cagr = np.nan

# Interactive graph
fig = px.bar(
    avg_spending_per_club,
    x='year',
    y='avg_spent_per_club',
    color='avg_spent_per_club',
    text=avg_spending_per_club['avg_spent_per_club'].map(lambda v: f"€{v:,.0f}"),
    title=f'Average Spending per Club by Year — CAGR {cagr*100:.1f}%',
    hover_data={'year': True}
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.update_layout(
    template='plotly_dark',
    xaxis_title='Year',
    yaxis_title='Average Spending per Club',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(
        tickmode='array',
        tickvals=avg_spending_per_club['year'],
        ticktext=avg_spending_per_club['year'],
        tickangle=55
    ),
    coloraxis_colorbar=dict(title='Spending (€)')
)

fig.show()

4.2. Percentage of Profitable Clubs by Season¶

In [128]:
# Base: year, cleaning and aggregations
df_base = (
    df_tm_transfers_cleaned
      .dropna(subset=['transfer_fee', 'transfer_date'])
      .assign(year=lambda d: d['transfer_date'].dt.year)
      .loc[lambda d: d['year'] >= 2008]
)

# Income by club-year (club as seller)
income_by_club_year = (
    df_base
      .groupby(['year', 'from_club_id'], as_index=False)
      .agg(total_income=('transfer_fee', 'sum'))
      .rename(columns={'from_club_id': 'club_id'})
)

# Spend by club-year (club as buyer)
spend_by_club_year = (
    df_base
      .groupby(['year', 'to_club_id'], as_index=False)
      .agg(total_spent=('transfer_fee', 'sum'))
      .rename(columns={'to_club_id': 'club_id'})
)

# Join income and spend
club_balance = (
    pd.merge(income_by_club_year, spend_by_club_year,
             on=['year', 'club_id'], how='outer')
      .fillna(0)
)

# Flag of profitable
club_balance['profitable'] = club_balance['total_income'] > club_balance['total_spent']

# Percentage of profitable clubs per year
profitability_by_year = (
    club_balance
      .groupby('year', as_index=False)
      .agg(
          pct_profitable=('profitable', lambda x: 100 * x.mean()),
          n_clubs=('club_id', 'nunique')
      )
      .sort_values('year')
)

# Graph
fig = px.bar(
    profitability_by_year,
    x='year',
    y='pct_profitable',
    color='pct_profitable',
    text=profitability_by_year['pct_profitable'].map(lambda v: f"{v:.1f}%"),
    title='Percentage of Profitable Clubs by Year',
    hover_data={'year': True, 'n_clubs': True}
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.update_layout(
    template='plotly_dark',
    xaxis_title='Year',
    yaxis_title='Profitable Clubs',
    hovermode='x unified',
    margin=dict(l=40, r=20, t=60, b=80),
    xaxis=dict(
        tickmode='array',
        tickvals=profitability_by_year['year'],
        ticktext=profitability_by_year['year'],
        tickangle=55
    ),
    coloraxis_colorbar=dict(title='% Profitable')
)

fig.show()

4.3. Percentage of Profitable Clubs of the last Decade¶

In [129]:
# Base: year, cleaning and aggregations
df_base = (
    df_tm_transfers_cleaned
      .dropna(subset=['transfer_fee', 'transfer_date'])
      .assign(year=lambda d: d['transfer_date'].dt.year)
      .loc[lambda d: d['year'] >= 2015]
)

# Total income by club
income_by_club = (
    df_base
      .groupby('from_club_id', as_index=False)
      .agg(total_income=('transfer_fee', 'sum'))
      .rename(columns={'from_club_id': 'club_id'})
)

# Total spend by club
spend_by_club = (
    df_base
      .groupby('to_club_id', as_index=False)
      .agg(total_spent=('transfer_fee', 'sum'))
      .rename(columns={'to_club_id': 'club_id'})
)

# Join income and spend
club_balance_total = (
    pd.merge(income_by_club, spend_by_club,
             on='club_id', how='outer')
      .fillna(0)
)

# Flag of profitable
club_balance_total['profitable'] = club_balance_total['total_income'] > club_balance_total['total_spent']

# Data for donut
values = [
    club_balance_total['profitable'].sum(),
    (~club_balance_total['profitable']).sum()
]
labels = ['Profitable', 'Not Profitable']

# Donut graph with custom colors
fig = px.pie(
    names=labels,
    values=values,
    title='Percentage of Profitable Clubs of the last Decade (2015-2024)',
    hole=0.5,
    color=labels,
    color_discrete_map={
        'Profitable': '#f48c45',
        'Not Profitable': '#5202a2'
    }
)

# Style and text
fig.update_traces(
    textinfo='label+percent',
    textposition='inside',
    insidetextorientation='horizontal',
    insidetextfont=dict(color='white'),
    outsidetextfont=dict(color='white'),
    texttemplate='%{label}<br>%{percent}',
    marker=dict(line=dict(color='#000000', width=2))
)

fig.update_layout(
    template='plotly_dark',
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=-0.15,
        xanchor='center',
        x=0.5
    ),
    margin=dict(l=40, r=40, t=60, b=40)
)

fig.show()